{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "P-AhVYeBWgQ3",
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "# NOTE: PLEASE MAKE SURE YOU ARE RUNNING THIS IN A PYTHON3 ENVIRONMENT\n",
    "\n",
    "import tensorflow as tf\n",
    "print(tf.__version__)\n",
    "\n",
    "# This is needed for the iterator over the data\n",
    "# But not necessary if you have TF 2.0 installed\n",
    "#!pip install tensorflow==2.0.0-beta0\n",
    "\n",
    "\n",
    "#tf.enable_eager_execution()\n",
    "\n",
    "#!pip install -q tensorflow-datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "_IoM4VFxWpMR",
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "import tensorflow_datasets as tfds\n",
    "imdb, info = tfds.load(\"imdb_reviews\", with_info=True, as_supervised=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "wHQ2Ko0zl7M4",
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "train_data, test_data = imdb['train'], imdb['test']\n",
    "\n",
    "\n",
    "training_sentences = []\n",
    "training_labels = []\n",
    "\n",
    "testing_sentences = []  \n",
    "testing_labels = []\n",
    "\n",
    "\n",
    "# str(s.tonumpy()) is needed in Python3 instead of just s.numpy()\n",
    "for s,l in train_data:\n",
    "  training_sentences.append(str(s.numpy()))\n",
    "  training_labels.append(l.numpy())\n",
    "  \n",
    "  \n",
    "for s,l in test_data:\n",
    "  testing_sentences.append(str(s.numpy()))\n",
    "  testing_labels.append(l.numpy())\n",
    " \n",
    "training_labels_final = np.array(training_labels)\n",
    "testing_labels_final = np.array(testing_labels)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "7n15yyMdmoH1",
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "vocab_size = 10000\n",
    "embedding_dim = 16\n",
    "max_length = 120\n",
    "trunc_type='post'\n",
    "oov_tok = \"<OOV>\"\n",
    "\n",
    "\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "\n",
    "tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)\n",
    "tokenizer.fit_on_texts(training_sentences)\n",
    "word_index = tokenizer.word_index\n",
    "sequences = tokenizer.texts_to_sequences(training_sentences)\n",
    "padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type, padding='post')\n",
    "\n",
    "testing_sequences = tokenizer.texts_to_sequences(testing_sentences)\n",
    "testing_padded = pad_sequences(testing_sequences,maxlen=max_length)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "9axf0uIXVMhO",
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])\n",
    "\n",
    "def decode_review(text):\n",
    "    return ' '.join([reverse_word_index.get(i, '?') for i in text])\n",
    "\n",
    "print(decode_review(padded[1]))\n",
    "print(training_sentences[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "5NEpdhb8AxID",
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "model = tf.keras.Sequential([\n",
    "    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),\n",
    "    tf.keras.layers.Flatten(),\n",
    "    tf.keras.layers.Dense(6, activation='relu'),\n",
    "    tf.keras.layers.Dense(1, activation='sigmoid')\n",
    "])\n",
    "model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])\n",
    "model.summary()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "V5LLrXC-uNX6",
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "num_epochs = 10\n",
    "model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "yAmjJqEyCOF_",
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "e = model.layers[0]\n",
    "weights = e.get_weights()[0]\n",
    "print(weights.shape) # shape: (vocab_size, embedding_dim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "jmB0Uxk0ycP6",
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "import io\n",
    "\n",
    "out_v = io.open('vecs.tsv', 'w', encoding='utf-8')\n",
    "out_m = io.open('meta.tsv', 'w', encoding='utf-8')\n",
    "for word_num in range(1, vocab_size):\n",
    "  word = reverse_word_index[word_num]\n",
    "  embeddings = weights[word_num]\n",
    "  out_m.write(word + \"\\n\")\n",
    "  out_v.write('\\t'.join([str(x) for x in embeddings]) + \"\\n\")\n",
    "out_v.close()\n",
    "out_m.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "VDeqpOCVydtq",
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "# try:\n",
    "#   from google.colab import files\n",
    "# except ImportError:\n",
    "#   pass\n",
    "# else:\n",
    "#   files.download('vecs.tsv')\n",
    "#   files.download('meta.tsv')"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "Course 3 - Week 2 - Lesson 1.ipynb",
   "provenance": [],
   "version": "0.3.2"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
